package com.ml4ai.backend.services.impl;

import com.ml4ai.backend.services.EsService;
import com.ml4ai.backend.services.WebCatchService;
import com.ml4ai.backend.utils.StringHelper;
import lombok.Cleanup;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.FileOutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Created by leecheng on 2018/10/19.
 */
@Service
@Slf4j
public class WebCatchServiceImpl implements WebCatchService {

    @Autowired
    EsService esService;

    ExecutorService executorService = Executors.newFixedThreadPool(12);

    AtomicInteger counter = new AtomicInteger(-1);

    @Override
    @SneakyThrows
    public Future<Boolean> catchPage(String taskId, List<String> seedUrl, String... regex) {
        WebPageGet webPageGet = new WebPageGet(taskId, (page, next) -> {
            Map<String, Object> data = new LinkedHashMap<>();
            String url = page.url();
            byte[] b = page.content();
            String title = null;
            if (!page.select("title").isEmpty()) {
                title = page.select("title").get(0).text();
            }
            String esId = StringHelper.md5(url);
            data.put("esId", esId.toUpperCase());
            data.put("url", url);
            data.put("title", title);
            data.put("html", b);
            esService.putJson("web_page", "default", esId.toUpperCase(), data);
            page.select("img").forEach(img -> {
                        String fromUrl = url;
                        executorService.execute(() -> {
                            try {
                                String baseURI = img.baseUri();
                                String originSrc = img.attr("src");
                                String src = originSrc;
                                if (src.toUpperCase().startsWith("HTTP://") || src.toUpperCase().startsWith("HTTPS://")) {

                                } else {
                                    //SRC通过baseURI和originSrc拼接
                                    src = baseURI;
                                    if (!src.endsWith("/")) {
                                        src += "/";
                                    }

                                    if (!originSrc.startsWith("/")) {
                                        src += originSrc;
                                    } else {
                                        int oneIndex = src.indexOf("/");
                                        int twoIndex = src.indexOf("/", oneIndex + 1);
                                        int threeIndex = src.indexOf("/", twoIndex + 1);
                                        src = src.substring(0, threeIndex) + originSrc;
                                    }
                                }
                                HttpClient client = HttpClients.createDefault();
                                HttpGet get = new HttpGet(src);
                                @Cleanup CloseableHttpResponse response = (CloseableHttpResponse) client.execute(get);
                                byte[] imgData = org.apache.commons.io.IOUtils.toByteArray(response.getEntity().getContent());
                                Map<String, Object> imgJson = new LinkedHashMap<>();
                                String id = StringHelper.md5(src);
                                imgJson.put("esId", id.toUpperCase());
                                imgJson.put("from", fromUrl);
                                imgJson.put("baseURI", baseURI);
                                imgJson.put("originSrc", originSrc);
                                imgJson.put("url", src);
                                imgJson.put("content", imgData);

                                log.info("download:" + src);
                                //String save = "d:/doc/data/image/" + URLEncoder.encode(fromUrl.split("\\?")[0], "UTF-8") + "/";
                                //File saveFile = new File(save);
                                //if (!saveFile.exists()) {
                                //    saveFile.mkdirs();
                                //}
                                //@Cleanup FileOutputStream fos = new FileOutputStream(save + counter.incrementAndGet() + "_" + id.toUpperCase() + ".jpg");
                                //fos.write(imgData);
                                //fos.flush();
                                esService.putJson("web_image", "default", id.toUpperCase(), imgJson);
                            } catch (Exception e) {
                                e.printStackTrace();
                                log.info("发生严重错误！");
                            }
                        });
                    }
            );
        }, true, seedUrl, regex);
        webPageGet.start(Integer.MAX_VALUE);
        return null;
    }


}
